#!/usr/local/bin/python3
# -*- coding: UTF-8 -*-
"""
@author:apple
@file:demo01.py
@time:2022/02/24
"""

import requests
import re
import json
import os
from lxml import etree

"""
爬取豆瓣电影Top250
"""

# Base address of the list; the numeric start offset is appended per request.
url = 'https://movie.douban.com/top250?start='

# Request headers sent with every page fetch (a desktop Chrome User-Agent).
headers = {
	'User-Agent': (
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
		'AppleWebKit/537.36 (KHTML, like Gecko) '
		'Chrome/80.0.3987.149 Safari/537.36'
	),
}

# Accumulates one dict per parsed film; filled by parse_data, written by save_data.
data = list()


def main():
	"""Fetch every list page, parse each one, then save the collected films.

	Pages are addressed by appending a start offset to ``url``; offsets
	0, 25, ..., 225 are requested (ten requests). A page that cannot be
	fetched, or that comes back with an HTTP error status, is reported on
	stdout and skipped, so a single bad response neither hangs the run nor
	hands an error page to the parser. ``save_data`` is always called at
	the end with whatever was collected.
	"""
	for start in range(0, 250, 25):
		page_url = url + str(start)
		try:
			# Without a timeout, requests can wait indefinitely on a stalled
			# connection.
			r = requests.get(page_url, headers=headers, timeout=10)
		except requests.RequestException as exc:
			print('Failed to fetch {}: {}'.format(page_url, exc))
			continue
		if not r.ok:
			# 4xx/5xx bodies are error pages, not list pages.
			print('Skipping {}: HTTP {}'.format(page_url, r.status_code))
			continue
		parse_data(r.text)
	save_data()


def parse_data(text):
	"""Extract the film entries from one page of HTML and append them to ``data``.

	Every ``<li>`` under ``<ol class="grid_view">`` becomes a dict with the
	keys ``title``, ``url``, ``pic``, ``rating_num``, ``commment_num`` and
	``info``. Each dict is printed and appended to the module-level ``data``
	list. A field whose XPath matches nothing is stored as an empty string.

	:param text: HTML source of a list page. Empty, whitespace-only, or
		otherwise root-less input adds nothing and returns quietly.
	"""
	if not text or not text.strip():
		# An empty body has no elements to extract.
		return
	html = etree.HTML(text)
	if html is None:
		# Guard against a parse that produced no root element; calling
		# .xpath on None would raise AttributeError.
		return

	def joined(node, path):
		# Concatenate all XPath string results; '' when nothing matches.
		return ''.join(node.xpath(path))

	for li in html.xpath('//ol[@class="grid_view"]/li'):
		# Named film_url so the module-level ``url`` is not shadowed.
		film_url = joined(li, './div/div[@class="pic"]/a/@href')
		pic = joined(li, './div/div[@class="pic"]/a/img/@src')
		title = joined(li, './div/div[@class="info"]/div[@class="hd"]/a/span[1]/text()')
		rating_num = joined(li, './div/div[@class="info"]/div[@class="bd"]/div/span[2]/text()')
		comment_num = joined(li, './div/div[@class="info"]/div[@class="bd"]/div/span[last()]/text()')
		info = joined(li, './div/div[@class="info"]/div[@class="bd"]/p/text()')

		# Remove every whitespace character from the description text.
		info = re.sub(r'\s', '', info)
		film = {
			'title': title,
			'url': film_url,
			'pic': pic,
			'rating_num': rating_num,
			# NOTE(review): key is misspelled (three m's). It is kept as-is
			# because it is part of the saved JSON schema; rename only together
			# with any consumers of the output file.
			'commment_num': comment_num,
			'info': info
		}
		print(film)
		data.append(film)


def save_data():
	"""Write the collected films in ``data`` to ``./result/doubanTop250.json``.

	The ``./result/`` directory is created when it does not exist. The file
	is written as UTF-8 JSON with a 4-space indent, and non-ASCII characters
	are stored verbatim rather than as ``\\uXXXX`` escapes. An existing file
	is overwritten.
	"""
	# exist_ok=True replaces the separate os.path.exists() check, which could
	# race with another process creating the directory in between.
	os.makedirs('./result/', exist_ok=True)
	with open('./result/doubanTop250.json', 'w', encoding='utf-8') as f:
		json.dump(data, f, indent=4, ensure_ascii=False)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()


